Data Description

Reference: https://www.kaggle.com/c/web-traffic-time-series-forecasting/data

I have cleaned the kaggle wikipedia traffic data and selected only data of 2016 with fraction of 0.1.

The data was melted and additional columns were created.

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
sns.set(context='notebook', style='whitegrid', rc={'figure.figsize': (12,8)})
plt.style.use('fivethirtyeight') # better than sns styles.
matplotlib.rcParams['figure.figsize'] = 12,8

import os
import time

# random state
random_state=100
np.random.seed(random_state)

# Jupyter notebook settings for pandas
#pd.set_option('display.float_format', '{:,.2g}'.format) # numbers sep by comma
from pandas.api.types import CategoricalDtype
np.set_printoptions(precision=3)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100) # None for all the rows
pd.set_option('display.max_colwidth', 200)

import IPython
from IPython.display import display, HTML, Image, Markdown

print([(x.__name__,x.__version__) for x in [np, pd,sns,matplotlib]])
[('numpy', '1.16.4'), ('pandas', '0.25.2'), ('seaborn', '0.9.0'), ('matplotlib', '3.1.1')]
In [2]:
import dask
import dask.dataframe as dd
import gc
In [3]:
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;

Useful Scripts

In [4]:
def show_method_attributes(method, ncols=7,start=None):
    """ Show all the attributes of a given method.
    Example:
    ========
    show_method_attributes(list)
     """
    x = [I for I in dir(method) if I[0]!='_' ]
    x = [I for I in x 
         if I not in 'os np pd sys time psycopg2'.split() ]
    if start:
        x = [I for I in x if I.startswith(start)]

    return pd.DataFrame(np.array_split(x,ncols)).T.fillna('')

Load the data

In [5]:
df = pd.read_csv('../../data/wiki/processed/data_cleaned_2016_frac01.csv',
                 parse_dates=['date'])

print(df.shape) # 5.3 million rows, 21 cols
df.head()
(5309196, 21)
Out[5]:
Page date visits year month day quarter dayofweek dayofyear day_name month_name weekend weekday mean median name project access agent lang language
0 Sean_Connery_en.wikipedia.org_desktop_all-agents 2016-01-01 4872 2016 1 1 1 4 1 Friday January False True 3405.661202 2624.0 Sean_Connery en.wikipedia.org desktop all-agents en English
1 Tableau_des_médailles_des_Jeux_olympiques_d'été_de_2008_fr.wikipedia.org_desktop_all-agents 2016-01-01 6 2016 1 1 1 4 1 Friday January False True 170.841530 18.0 Tableau_des_médailles_des_Jeux_olympiques_d'été_de_2008 fr.wikipedia.org desktop all-agents fr French
2 The_Undertaker_fr.wikipedia.org_mobile-web_all-agents 2016-01-01 469 2016 1 1 1 4 1 Friday January False True 400.336066 345.5 The_Undertaker fr.wikipedia.org mobile-web all-agents fr French
3 Category:Outdoor_sex_commons.wikimedia.org_all-access_all-agents 2016-01-01 142 2016 1 1 1 4 1 Friday January False True 205.174863 193.0 Category:Outdoor_sex commons.wikimedia.org all-access all-agents commons Media
4 Камызяк_ru.wikipedia.org_all-access_all-agents 2016-01-01 6692 2016 1 1 1 4 1 Friday January False True 912.516393 559.0 Камызяк ru.wikipedia.org all-access all-agents ru Russian

Memory Reduction

In [6]:
df.dtypes
Out[6]:
Page                  object
date          datetime64[ns]
visits                 int64
year                   int64
month                  int64
day                    int64
quarter                int64
dayofweek              int64
dayofyear              int64
day_name              object
month_name            object
weekend                 bool
weekday                 bool
mean                 float64
median               float64
name                  object
project               object
access                object
agent                 object
lang                  object
language              object
dtype: object
In [7]:
df.memory_usage(deep=True).sum() * 1e-6 # MB
Out[7]:
4069.15316
In [8]:
# all the year is 2016,drop it.

df.drop('year',axis=1,inplace=True)
In [9]:
cols_int = ['visits']
cols_cat = ['month','day','quarter','day_name','month_name',
            'project','access','agent','language']

cols_float = ['mean','median']

for c in cols_int:
    df[c] = df[c].astype(np.int32)
    
for c in cols_float:
    df[c] = df[c].astype(np.float32)
    

for c in cols_cat:
    df[c] = df[c].astype(pd.api.types.CategoricalDtype())
In [10]:
# make day_name and month_name ordered categorical
df['day_name'].unique()
Out[10]:
[Friday, Saturday, Sunday, Monday, Tuesday, Wednesday, Thursday]
Categories (7, object): [Friday, Saturday, Sunday, Monday, Tuesday, Wednesday, Thursday]
In [11]:
cats = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

df['day_name'] = pd.Categorical(df['day_name'], ordered=True, categories=cats)

df['day_name'].unique()
Out[11]:
[Friday, Saturday, Sunday, Monday, Tuesday, Wednesday, Thursday]
Categories (7, object): [Monday < Tuesday < Wednesday < Thursday < Friday < Saturday < Sunday]
In [12]:
df['month_name'].unique()
Out[12]:
[January, February, March, April, May, ..., August, September, October, November, December]
Length: 12
Categories (12, object): [January, February, March, April, ..., September, October, November, December]
In [13]:
show_method_attributes(df['month_name'].unique())
Out[13]:
0 1 2 3 4 5 6
0 T check_for_ordered fillna map notnull repeat take
1 add_categories codes from_codes max ordered searchsorted take_nd
2 argsort copy get_values memory_usage put set_categories to_dense
3 as_ordered describe is_dtype_equal min ravel set_ordered to_list
4 as_unordered dropna isin mode remove_categories shape unique
5 astype dtype isna nbytes remove_unused_categories shift value_counts
6 base equals isnull ndim rename_categories size view
7 categories factorize itemsize notna reorder_categories sort_values
In [14]:
df['month_name'].unique().categories
Out[14]:
Index(['January', 'February', 'March', 'April', 'May', 'June', 'July',
       'August', 'September', 'October', 'November', 'December'],
      dtype='object')
In [15]:
cats = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
       'August', 'September', 'October', 'November', 'December']

df['month_name'] = pd.Categorical(df['month_name'], ordered=True, categories=cats)

df['month_name'].unique()
Out[15]:
[January, February, March, April, May, ..., August, September, October, November, December]
Length: 12
Categories (12, object): [January < February < March < April ... September < October < November < December]
In [16]:
df.memory_usage(deep=True).sum() * 1e-6 # MB
Out[16]:
1777.2233549999999
In [17]:
for c in cols_cat:
    print(c)
    print(df[c].value_counts().sort_index())
    print()
month
1     449686
2     420674
3     449686
4     435180
5     449686
6     435180
7     449686
8     449686
9     435180
10    449686
11    435180
12    449686
Name: month, dtype: int64

day
1     174072
2     174072
3     174072
4     174072
5     174072
6     174072
7     174072
8     174072
9     174072
10    174072
11    174072
12    174072
13    174072
14    174072
15    174072
16    174072
17    174072
18    174072
19    174072
20    174072
21    174072
22    174072
23    174072
24    174072
25    174072
26    174072
27    174072
28    174072
29    174072
30    159566
31    101542
Name: day, dtype: int64

quarter
1    1320046
2    1320046
3    1334552
4    1334552
Name: quarter, dtype: int64

day_name
Monday       754312
Tuesday      754312
Wednesday    754312
Thursday     754312
Friday       768818
Saturday     768818
Sunday       754312
Name: day_name, dtype: int64

month_name
January      449686
February     420674
March        449686
April        435180
May          449686
June         435180
July         449686
August       449686
September    435180
October      449686
November     435180
December     449686
Name: month_name, dtype: int64

project
commons.wikimedia.org    390888
de.wikipedia.org         688080
en.wikipedia.org         863028
es.wikipedia.org         506544
fr.wikipedia.org         637938
ja.wikipedia.org         756156
ru.wikipedia.org         566568
www.mediawiki.org        262422
zh.wikipedia.org         637572
Name: project, dtype: int64

access
all-access    2705106
desktop       1272216
mobile-web    1331874
Name: access, dtype: int64

agent
all-agents    4050156
spider        1259040
Name: agent, dtype: int64

language
Chinese     637572
English     863028
French      637938
German      688080
Japanese    756156
Media       653310
Russian     566568
Spanish     506544
Name: language, dtype: int64

Exploratory Data Analysis (EDA)

Most visited page

In [18]:
print(df.shape)
df.head()
(5309196, 20)
Out[18]:
Page date visits month day quarter dayofweek dayofyear day_name month_name weekend weekday mean median name project access agent lang language
0 Sean_Connery_en.wikipedia.org_desktop_all-agents 2016-01-01 4872 1 1 1 4 1 Friday January False True 3405.661133 2624.0 Sean_Connery en.wikipedia.org desktop all-agents en English
1 Tableau_des_médailles_des_Jeux_olympiques_d'été_de_2008_fr.wikipedia.org_desktop_all-agents 2016-01-01 6 1 1 1 4 1 Friday January False True 170.841537 18.0 Tableau_des_médailles_des_Jeux_olympiques_d'été_de_2008 fr.wikipedia.org desktop all-agents fr French
2 The_Undertaker_fr.wikipedia.org_mobile-web_all-agents 2016-01-01 469 1 1 1 4 1 Friday January False True 400.336060 345.5 The_Undertaker fr.wikipedia.org mobile-web all-agents fr French
3 Category:Outdoor_sex_commons.wikimedia.org_all-access_all-agents 2016-01-01 142 1 1 1 4 1 Friday January False True 205.174866 193.0 Category:Outdoor_sex commons.wikimedia.org all-access all-agents commons Media
4 Камызяк_ru.wikipedia.org_all-access_all-agents 2016-01-01 6692 1 1 1 4 1 Friday January False True 912.516418 559.0 Камызяк ru.wikipedia.org all-access all-agents ru Russian
In [19]:
df['Page'].nunique() # there are 14.5k unique pages visited in 2016
Out[19]:
14506
In [20]:
df.groupby('Page')['visits'].sum()
Out[20]:
Page
"Keep_me_logged_in"_extended_to_one_year_www.mediawiki.org_mobile-web_all-agents        728
.bn_ru.wikipedia.org_desktop_all-agents                                               49200
007:_Спектр_ru.wikipedia.org_desktop_all-agents                                      308925
007_スペクター_ja.wikipedia.org_desktop_all-agents                                        219297
007:惡魔四伏_zh.wikipedia.org_desktop_all-agents                                         142666
                                                                                     ...   
龍涎香_zh.wikipedia.org_desktop_all-agents                                               56076
龍珠超_zh.wikipedia.org_all-access_spider                                                28444
龔嘉欣_zh.wikipedia.org_all-access_all-agents                                           178525
[Alexandros]_ja.wikipedia.org_all-access_spider                                       92253
[Alexandros]_ja.wikipedia.org_mobile-web_all-agents                                 1078434
Name: visits, Length: 14506, dtype: int32
In [154]:
df.groupby('Page')['visits'].sum().sort_values(ascending=False)
Out[154]:
Page
Special:Search_en.wikipedia.org_desktop_all-agents                                                                                                675606021
Wikipédia:Accueil_principal_fr.wikipedia.org_all-access_all-agents                                                                                581446666
Служебная:Поиск_ru.wikipedia.org_all-access_all-agents                                                                                             65811140
Spécial:Recherche_fr.wikipedia.org_all-access_all-agents                                                                                           64200822
Web_scraping_en.wikipedia.org_all-access_all-agents                                                                                                40047558
                                                                                                                                                    ...    
Nafri_de.wikipedia.org_desktop_all-agents                                                                                                                 0
Die_Ketzerbraut_(Film)_de.wikipedia.org_all-access_spider                                                                                                 0
User:46.188.28.235_commons.wikimedia.org_all-access_spider                                                                                                0
User:46.188.28.235_commons.wikimedia.org_all-access_all-agents                                                                                            0
Wikipedia_Diskussion:Meinungsbilder/keine_Bilder_in_Artikelnamensraum_von_direkt_abmahnenden_Fotografen_de.wikipedia.org_all-access_all-agents            0
Name: visits, Length: 14506, dtype: int32
In [153]:
df.groupby('Page')['visits'].sum().nlargest(5)
Out[153]:
Page
Special:Search_en.wikipedia.org_desktop_all-agents                    675606021
Wikipédia:Accueil_principal_fr.wikipedia.org_all-access_all-agents    581446666
Служебная:Поиск_ru.wikipedia.org_all-access_all-agents                 65811140
Spécial:Recherche_fr.wikipedia.org_all-access_all-agents               64200822
Web_scraping_en.wikipedia.org_all-access_all-agents                    40047558
Name: visits, dtype: int32
In [156]:
df.query(""" Page == 'Special:Search_en.wikipedia.org_desktop_all-agents' """).head()
Out[156]:
Page date visits month day quarter dayofweek dayofyear day_name month_name weekend weekday mean median name project access agent lang language
2297 Special:Search_en.wikipedia.org_desktop_all-agents 2016-01-01 1401667 1 1 1 4 1 Friday January False True 1845918.125 1700576.5 Special:Search en.wikipedia.org desktop all-agents en English
16803 Special:Search_en.wikipedia.org_desktop_all-agents 2016-01-02 1395136 1 2 1 5 2 Saturday January True False 1845918.125 1700576.5 Special:Search en.wikipedia.org desktop all-agents en English
31309 Special:Search_en.wikipedia.org_desktop_all-agents 2016-01-03 1455522 1 3 1 6 3 Sunday January True False 1845918.125 1700576.5 Special:Search en.wikipedia.org desktop all-agents en English
45815 Special:Search_en.wikipedia.org_desktop_all-agents 2016-01-04 1750373 1 4 1 0 4 Monday January False True 1845918.125 1700576.5 Special:Search en.wikipedia.org desktop all-agents en English
60321 Special:Search_en.wikipedia.org_desktop_all-agents 2016-01-05 1787494 1 5 1 1 5 Tuesday January False True 1845918.125 1700576.5 Special:Search en.wikipedia.org desktop all-agents en English

Top 5 pages per language

In [13]:
# df.groupby('language')['visits'].nlargest(5)
In [14]:
# df.groupby('language')['visits'].apply(lambda x: x.nlargest(5))
In [15]:
# df.groupby('language')['visits'].apply(lambda x: x.nlargest(5).index)
In [16]:
df.groupby('language')['visits'].apply(lambda x: df.loc[x.nlargest(5).index])
Out[16]:
Page date visits month day quarter dayofweek dayofyear day_name month_name weekend weekday mean median name project access agent language
language
Chinese 3526717 緋彈的亞莉亞角色列表_zh.wikipedia.org_desktop_all-agents 2016-08-31 243557 8 31 3 2 244 Wednesday August False True 8.130765e+02 130.0 緋彈的亞莉亞角色列表 zh.wikipedia.org desktop all-agents Chinese
727287 九层妖塔_zh.wikipedia.org_desktop_all-agents 2016-02-20 181033 2 20 1 5 51 Saturday February True False 6.724918e+02 132.5 九层妖塔 zh.wikipedia.org desktop all-agents Chinese
2401631 伊是名島_zh.wikipedia.org_desktop_all-agents 2016-06-14 141544 6 14 2 1 166 Tuesday June False True 3.884699e+02 1.0 伊是名島 zh.wikipedia.org desktop all-agents Chinese
3585506 2016年香港立法會選舉_zh.wikipedia.org_all-access_all-agents 2016-09-04 99169 9 4 3 6 248 Sunday September True False 2.679394e+03 884.0 2016年香港立法會選舉 zh.wikipedia.org all-access all-agents Chinese
3600012 2016年香港立法會選舉_zh.wikipedia.org_all-access_all-agents 2016-09-05 98282 9 5 3 0 249 Monday September False True 2.679394e+03 884.0 2016年香港立法會選舉 zh.wikipedia.org all-access all-agents Chinese
English 2714919 Special:Search_en.wikipedia.org_desktop_all-agents 2016-07-06 16592075 7 6 3 2 188 Wednesday July False True 1.845918e+06 1700576.5 Special:Search en.wikipedia.org desktop all-agents English
3556267 Special:Search_en.wikipedia.org_desktop_all-agents 2016-09-02 7599524 9 2 3 4 246 Friday September False True 1.845918e+06 1700576.5 Special:Search en.wikipedia.org desktop all-agents English
3570773 Special:Search_en.wikipedia.org_desktop_all-agents 2016-09-03 6894531 9 3 3 5 247 Saturday September True False 1.845918e+06 1700576.5 Special:Search en.wikipedia.org desktop all-agents English
3541761 Special:Search_en.wikipedia.org_desktop_all-agents 2016-09-01 6878515 9 1 3 3 245 Thursday September False True 1.845918e+06 1700576.5 Special:Search en.wikipedia.org desktop all-agents English
3585279 Special:Search_en.wikipedia.org_desktop_all-agents 2016-09-04 6457072 9 4 3 6 248 Sunday September True False 1.845918e+06 1700576.5 Special:Search en.wikipedia.org desktop all-agents English
French 2163034 Wikipédia:Accueil_principal_fr.wikipedia.org_all-access_all-agents 2016-05-29 1845404 5 29 2 6 150 Sunday May True False 1.588652e+06 1601521.0 Wikipédia:Accueil_principal fr.wikipedia.org all-access all-agents French
2075998 Wikipédia:Accueil_principal_fr.wikipedia.org_all-access_all-agents 2016-05-23 1843295 5 23 2 0 144 Monday May False True 1.588652e+06 1601521.0 Wikipédia:Accueil_principal fr.wikipedia.org all-access all-agents French
2177540 Wikipédia:Accueil_principal_fr.wikipedia.org_all-access_all-agents 2016-05-30 1841301 5 30 2 0 151 Monday May False True 1.588652e+06 1601521.0 Wikipédia:Accueil_principal fr.wikipedia.org all-access all-agents French
1872914 Wikipédia:Accueil_principal_fr.wikipedia.org_all-access_all-agents 2016-05-09 1835599 5 9 2 0 130 Monday May False True 1.588652e+06 1601521.0 Wikipédia:Accueil_principal fr.wikipedia.org all-access all-agents French
1203163 Spécial:Connexion_fr.wikipedia.org_desktop_all-agents 2016-03-23 1830126 3 23 1 2 83 Wednesday March False True 4.561362e+04 15001.5 Spécial:Connexion fr.wikipedia.org desktop all-agents French
German 4439792 Gerätestecker_de.wikipedia.org_desktop_all-agents 2016-11-02 558381 11 2 4 2 307 Wednesday November False True 1.870743e+03 398.0 Gerätestecker de.wikipedia.org desktop all-agents German
2583806 Island_de.wikipedia.org_all-access_all-agents 2016-06-27 490526 6 27 2 0 179 Monday June False True 9.102489e+03 3276.5 Island de.wikipedia.org all-access all-agents German
2260103 Muhammad_Ali_de.wikipedia.org_all-access_all-agents 2016-06-04 427040 6 4 2 5 156 Saturday June True False 3.935115e+03 1219.5 Muhammad_Ali de.wikipedia.org all-access all-agents German
2586487 Island_de.wikipedia.org_mobile-web_all-agents 2016-06-27 387417 6 27 2 0 179 Monday June False True 5.569271e+03 1450.0 Island de.wikipedia.org mobile-web all-agents German
4570774 San_Marino_de.wikipedia.org_all-access_all-agents 2016-11-11 381926 11 11 4 4 316 Friday November False True 2.644194e+03 997.0 San_Marino de.wikipedia.org all-access all-agents German
Japanese 2724137 デイヴィッド・ロックフェラー_ja.wikipedia.org_all-access_all-agents 2016-07-06 1651272 7 6 3 2 188 Wednesday July False True 8.011582e+03 143.0 デイヴィッド・ロックフェラー ja.wikipedia.org all-access all-agents Japanese
2738643 デイヴィッド・ロックフェラー_ja.wikipedia.org_all-access_all-agents 2016-07-07 1196242 7 7 3 3 189 Thursday July False True 8.011582e+03 143.0 デイヴィッド・ロックフェラー ja.wikipedia.org all-access all-agents Japanese
1877171 ポラロイド_ja.wikipedia.org_all-access_all-agents 2016-05-09 1000484 5 9 2 0 130 Monday May False True 2.816202e+03 80.0 ポラロイド ja.wikipedia.org all-access all-agents Japanese
1871642 ポラロイド_ja.wikipedia.org_desktop_all-agents 2016-05-09 1000441 5 9 2 0 130 Monday May False True 2.780066e+03 44.0 ポラロイド ja.wikipedia.org desktop all-agents Japanese
621742 宮崎謙介_ja.wikipedia.org_all-access_all-agents 2016-02-12 707916 2 12 1 4 43 Friday February False True 6.258046e+03 338.0 宮崎謙介 ja.wikipedia.org all-access all-agents Japanese
Media 1865460 Parsoid/Developer_Setup_www.mediawiki.org_all-access_all-agents 2016-05-08 927825 5 8 2 6 129 Sunday May True False 1.275389e+04 60.0 Parsoid/Developer_Setup www.mediawiki.org all-access all-agents Media
1850954 Parsoid/Developer_Setup_www.mediawiki.org_all-access_all-agents 2016-05-07 917959 5 7 2 5 128 Saturday May True False 1.275389e+04 60.0 Parsoid/Developer_Setup www.mediawiki.org all-access all-agents Media
1879966 Parsoid/Developer_Setup_www.mediawiki.org_all-access_all-agents 2016-05-09 903318 5 9 2 0 130 Monday May False True 1.275389e+04 60.0 Parsoid/Developer_Setup www.mediawiki.org all-access all-agents Media
1836448 Parsoid/Developer_Setup_www.mediawiki.org_all-access_all-agents 2016-05-06 859422 5 6 2 4 127 Friday May False True 1.275389e+04 60.0 Parsoid/Developer_Setup www.mediawiki.org all-access all-agents Media
3552910 Extension:CiteThisPage_www.mediawiki.org_desktop_all-agents 2016-09-01 682334 9 1 3 3 245 Thursday September False True 4.062844e+03 18.0 Extension:CiteThisPage www.mediawiki.org desktop all-agents Media
Russian 4179962 Служебная:Поиск_ru.wikipedia.org_all-access_all-agents 2016-10-15 1412292 10 15 4 5 289 Saturday October True False 1.798119e+05 171580.0 Служебная:Поиск ru.wikipedia.org all-access all-agents Russian
4194468 Служебная:Поиск_ru.wikipedia.org_all-access_all-agents 2016-10-16 1107601 10 16 4 6 290 Sunday October True False 1.798119e+05 171580.0 Служебная:Поиск ru.wikipedia.org all-access all-agents Russian
2249807 Али,_Мохаммед_ru.wikipedia.org_all-access_all-agents 2016-06-04 562514 6 4 2 5 156 Saturday June True False 5.483262e+03 2232.0 Али,_Мохаммед ru.wikipedia.org all-access all-agents Russian
2833429 Toyota_Land_Cruiser_Prado_ru.wikipedia.org_all-access_all-agents 2016-07-14 559263 7 14 3 3 196 Thursday July False True 2.988973e+03 408.5 Toyota_Land_Cruiser_Prado ru.wikipedia.org all-access all-agents Russian
4165456 Служебная:Поиск_ru.wikipedia.org_all-access_all-agents 2016-10-14 524217 10 14 4 4 288 Friday October False True 1.798119e+05 171580.0 Служебная:Поиск ru.wikipedia.org all-access all-agents Russian
Spanish 2376682 Nilo_es.wikipedia.org_desktop_all-agents 2016-06-12 783454 6 12 2 6 164 Sunday June True False 2.780180e+03 628.5 Nilo es.wikipedia.org desktop all-agents Spanish
4790265 Fidel_Castro_es.wikipedia.org_mobile-web_all-agents 2016-11-26 524066 11 26 4 5 331 Saturday November True False 4.841872e+03 1485.5 Fidel_Castro es.wikipedia.org mobile-web all-agents Spanish
4275186 Asesinato_de_Luis_Donaldo_Colosio_Murrieta_es.wikipedia.org_all-access_all-agents 2016-10-21 506152 10 21 4 4 295 Friday October False True 1.782186e+03 349.0 Asesinato_de_Luis_Donaldo_Colosio_Murrieta es.wikipedia.org all-access all-agents Spanish
2058210 María_Teresa_Andruetto_es.wikipedia.org_desktop_all-agents 2016-05-21 502511 5 21 2 5 142 Saturday May True False 1.412732e+03 33.5 María_Teresa_Andruetto es.wikipedia.org desktop all-agents Spanish
144253 Especial:Entrar_es.wikipedia.org_desktop_all-agents 2016-01-10 497988 1 10 1 6 10 Sunday January True False 5.474865e+04 18132.5 Especial:Entrar es.wikipedia.org desktop all-agents Spanish

Data Visualizations

Single Timeseries Visualization

In [157]:
idx = df.groupby('Page')['visits'].sum().idxmax()
df.query(""" Page == @idx """).head()
Out[157]:
Page date visits month day quarter dayofweek dayofyear day_name month_name weekend weekday mean median name project access agent lang language
2297 Special:Search_en.wikipedia.org_desktop_all-agents 2016-01-01 1401667 1 1 1 4 1 Friday January False True 1845918.125 1700576.5 Special:Search en.wikipedia.org desktop all-agents en English
16803 Special:Search_en.wikipedia.org_desktop_all-agents 2016-01-02 1395136 1 2 1 5 2 Saturday January True False 1845918.125 1700576.5 Special:Search en.wikipedia.org desktop all-agents en English
31309 Special:Search_en.wikipedia.org_desktop_all-agents 2016-01-03 1455522 1 3 1 6 3 Sunday January True False 1845918.125 1700576.5 Special:Search en.wikipedia.org desktop all-agents en English
45815 Special:Search_en.wikipedia.org_desktop_all-agents 2016-01-04 1750373 1 4 1 0 4 Monday January False True 1845918.125 1700576.5 Special:Search en.wikipedia.org desktop all-agents en English
60321 Special:Search_en.wikipedia.org_desktop_all-agents 2016-01-05 1787494 1 5 1 1 5 Tuesday January False True 1845918.125 1700576.5 Special:Search en.wikipedia.org desktop all-agents en English
In [158]:
ts = df.query(""" Page == @idx """)[['date','visits']].set_index('date')

print(ts.shape)
ts.head()
(366, 1)
Out[158]:
visits
date
2016-01-01 1401667
2016-01-02 1395136
2016-01-03 1455522
2016-01-04 1750373
2016-01-05 1787494
In [159]:
ts.plot()

# ts is periodic
# ts has some very large peaks
# ts in not going upward, it does not have trend (it may have if I have more years)
Out[159]:
<matplotlib.axes._subplots.AxesSubplot at 0x1de52fcc0>
In [163]:
ts.groupby(ts.index.month).plot();

Language montly mean

In [17]:
df.head(2)
Out[17]:
Page date visits month day quarter dayofweek dayofyear day_name month_name weekend weekday mean median name project access agent language
0 Sean_Connery_en.wikipedia.org_desktop_all-agents 2016-01-01 4872 1 1 1 4 1 Friday January False True 3405.661133 2624.0 Sean_Connery en.wikipedia.org desktop all-agents English
1 Tableau_des_médailles_des_Jeux_olympiques_d'été_de_2008_fr.wikipedia.org_desktop_all-agents 2016-01-01 6 1 1 1 4 1 Friday January False True 170.841537 18.0 Tableau_des_médailles_des_Jeux_olympiques_d'été_de_2008 fr.wikipedia.org desktop all-agents French
In [21]:
fname_lang_monthly_mean = '../reports/figures/2016_sample001_monthly_visits.png'


if not os.path.isfile(fname_lang_monthly_mean):
    plt.figure(figsize=(12,12))
    sns.pointplot(x="month_name", y="visits", hue='language', data=df,estimator='mean')
    plt.savefig(fname_lang_monthly_mean, dpi=300)
In [19]:
Image(fname_lang_monthly_mean)
Out[19]:
In [20]:
# df.groupby(['month_name', 'language'])['visits'].sum().unstack().reset_index()
# TypeError: cannot insert an item into a CategoricalIndex that is not already an existing category
In [21]:
df.groupby(['month_name', 'language'])['visits'].mean().unstack()
Out[21]:
language Chinese English French German Japanese Media Russian Spanish
month_name
April 346.284022 3357.215861 1471.329241 499.983670 746.976186 121.054883 911.464255 1126.387741
August 356.399893 3577.341925 1388.243592 438.243154 770.811635 94.792048 678.857298 1020.159426
December 342.306655 3198.113191 1461.638758 511.450275 724.673610 104.160007 951.266525 1072.698443
February 323.293440 3753.439604 1526.610105 575.243269 798.092583 83.687144 1000.508064 1106.103324
January 320.215881 3532.314414 1549.011271 607.940031 836.786685 77.659781 989.145724 962.381643
July 349.729306 3393.508837 1406.123128 480.157773 823.521079 71.903840 721.329228 857.897445
June 326.015117 3175.392253 1490.617441 608.998475 738.780800 122.201494 810.115568 1070.707346
March 307.180716 2998.148075 1535.426017 531.052471 712.769806 89.373850 914.002021 1199.071742
May 315.227825 2982.470396 1538.262987 490.853380 791.984027 211.465600 877.348358 1122.429564
November 343.012246 3541.270469 1428.079633 595.995053 704.154969 77.519122 1023.325431 1260.698964
October 321.757231 3095.529577 1526.691263 535.785999 667.404647 76.556176 883.324435 1098.649380
September 343.960295 3436.207492 1465.977357 446.790869 735.662811 115.615686 764.727713 1081.434658
In [22]:
ax = df.groupby(['month_name', 'language'])['visits'].mean().unstack()\
.reset_index(drop=True).plot(figsize=(12,12), logy=False )

plt.xticks(range(12), rotation=90)
ax.set_xticklabels(df['month_name'].unique());
In [23]:
# exclude english and compare others
In [24]:
# plt.figure(figsize=(12,12))

# df.query("""language != 'English'""").pipe((sns.pointplot,'data'), x='month_name',
#                                                         y='visits',hue='language')
# # ValueError: 'c' argument has 12 elements, which is not acceptable for use with 'x' with size 0, 'y' with size 0.
In [25]:
ax = df.query("""language != 'English'""")\
.groupby(['month_name', 'language'])['visits'].mean().unstack()\
.reset_index(drop=True).plot(figsize=(12,12), logy=False )

plt.xticks(range(12), rotation=90)
ax.set_xticklabels(df['month_name'].unique());

Timeseries per language

In [22]:
df.groupby(['dayofyear', 'language'])['visits'].mean().unstack().plot(figsize=(12,12))
plt.savefig('../reports/figures/daily_visits.png',dpi=300)
In [28]:
%%time
# plt.figure(figsize=(12,12))
# sns.lineplot(x='dayofyear',y='visits',hue='language',data=df)

# Wall time: 2min 27s
CPU times: user 2min 10s, sys: 1.6 s, total: 2min 12s
Wall time: 2min 27s

Page Visits per Week Day

In [128]:
df['day_name'].unique().categories
Out[128]:
Index(['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday',
       'Sunday'],
      dtype='object')
In [23]:
%%time
sns.barplot(x='day_name',y='visits',data=df,
            order=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday',
       'Sunday'])

plt.savefig('../reports/figures/visits_by_weekday.png')
CPU times: user 1min 18s, sys: 2.3 s, total: 1min 21s
Wall time: 1min 24s
In [77]:
%%time
df.groupby('day_name')['visits'].mean().sort_index().plot.bar()
CPU times: user 75.7 ms, sys: 6.9 ms, total: 82.6 ms
Wall time: 91.3 ms
Out[77]:
<matplotlib.axes._subplots.AxesSubplot at 0x12582f668>
In [89]:
%%time
ax = df.groupby('day_name')['visits'].mean().sort_index(ascending=False)\
.plot.barh(use_index=True, color=sns.color_palette('husl',7))


plt.tick_params(axis='y', which='both', labelright='on')
CPU times: user 69 ms, sys: 8.23 ms, total: 77.3 ms
Wall time: 88.9 ms
In [99]:
df.groupby(['day_name','month_name'])['visits'].mean().sort_index(ascending=False).unstack().plot.bar()
Out[99]:
<matplotlib.axes._subplots.AxesSubplot at 0x12585d6d8>
In [24]:
%%time
sns.catplot(data=df, kind='bar',ci=None,
            x='day_name',y='visits',hue='month_name')
plt.xticks(rotation=90)

plt.savefig('../reports/figures/visits_per_month_per_weekday.png')
CPU times: user 17.3 s, sys: 1.6 s, total: 18.9 s
Wall time: 19.2 s
In [101]:
df.groupby(['day_name','month_name'])['visits'].mean().sort_index(ascending=False).unstack(0).plot.bar()
Out[101]:
<matplotlib.axes._subplots.AxesSubplot at 0x123553ac8>

Page Visits per Month Day

In [25]:
df1 = df.groupby(["day_name", "day"])['visits'].mean().reset_index()\
.pivot('day','day_name','visits').dropna()

fig, ax = plt.subplots(figsize=(50, 30))
sns.heatmap(data=df1, annot=False, ax=ax, fmt="d", linewidths=2).invert_yaxis()
plt.title('Web Traffic per Days of Week',fontsize=28)

plt.xlabel('Week Day Name', fontsize=28)
plt.ylabel('Day of Month', fontsize=28)
plt.xticks(fontsize=28)
plt.yticks(fontsize=28)

cbar = ax.collections[0].colorbar
cbar.ax.tick_params(labelsize=28)
plt.savefig('../reports/figures/visits_per_dayofmonth_per_dayofweek.png')
plt.show()
In [125]:
df1 = df.groupby(["month_name", "day"])['visits'].mean().reset_index()\
.pivot('day','month_name','visits').dropna()

fig, ax = plt.subplots(figsize=(50, 30))
# sns.set(font_scale=3)
sns.heatmap(data=df1, annot=False, ax=ax, fmt="d", linewidths=2).invert_yaxis()
plt.title('Web Traffic for Months per days of month',fontsize=28)

plt.xlabel('Month Name', fontsize=28)
plt.ylabel('Day of Month', fontsize=28)
plt.xticks(fontsize=28)
plt.yticks(fontsize=28)

cbar = ax.collections[0].colorbar
cbar.ax.tick_params(labelsize=28)

plt.show()

Fast Fourier Transform (FFT)

Resources:

Here we can that the plots seems periodic in nature in time domain. We can work in the frequency domain using FFT transformation of the time series. Peaks in the FFT show us the strongest frequencies in the periodic signal.

The Fourier transform is an alternative representation of a signal as a superposition of periodic components. It is an important mathematical result that any well-behaved function can be represented under this form. Whereas a time-varying signal is most naturally considered as a function of time, the Fourier transform represents it as a function of the frequency. A magnitude and a phase, which are both encoded in a single complex number, are associated to each frequency.

The Discrete Fourier Transform
Let's consider a digital signal x represented by a vector $(x0,...,xN−1)$. We assume that this signal is regularly sampled. The Discrete Fourier Transform (DFT) of x is $X=(X0,...,XN−1)$ defined as:

$$ \forall k \in\{0, \ldots, N-1\}, \quad X_{k}=\sum_{n=0}^{N-1} x_{n} e^{-2\ i \pi k\ n / N} $$

The DFT can be computed efficiently with the Fast Fourier Transform (FFT), an algorithm that exploits symmetries and redundancies in this definition to considerably speed up the computation. The complexity of the FFT is $O(NlogN)$ instead of $O(N^2)$ for the naive DFT. The FFT is one of the most important algorithms of the digital universe.

In [43]:
days = df['dayofyear'].unique()
In [40]:
df_daily = df.groupby(['dayofyear', 'language'])['visits'].mean().unstack()
df_daily.head()
Out[40]:
language Chinese English French German Japanese Media Russian Spanish
dayofyear
1 357.400689 3007.226039 1435.950086 654.809043 865.126815 59.282913 870.183463 480.771676
2 338.280712 3498.513995 1510.561675 609.202660 1133.100678 75.859944 936.125323 579.312861
3 377.146383 3287.531807 1606.372920 685.334574 884.638916 66.575910 1013.076873 606.973266
4 292.405281 3173.361323 1598.381526 621.768617 783.205227 72.059384 930.982558 655.987717
5 285.670494 3073.982188 1600.864601 561.303723 826.985963 74.003361 900.455426 641.997110
In [41]:
df_daily.columns
Out[41]:
CategoricalIndex(['Chinese', 'English', 'French', 'German', 'Japanese',
                  'Media', 'Russian', 'Spanish'],
                 categories=['Chinese', 'English', 'French', 'German', 'Japanese', 'Media', 'Russian', 'Spanish'], ordered=False, name='language', dtype='category')
In [44]:
def plot_with_fft(df_daily, col):
    from scipy.fftpack import fft
    
    fig = plt.figure(1,figsize=[15,5])
    plt.ylabel('Views per Page')
    plt.xlabel('Day')
    
    plt.title(col)
    plt.xticks(range(0,370,10))
    plt.plot(days, df_daily[col].to_numpy(),label = col )
    
    fig = plt.figure(2,figsize=[15,5])
    fft_complex = fft(df_daily[col].to_numpy())
    
    fft_mag = [np.sqrt(np.real(x)*np.real(x)+np.imag(x)*np.imag(x))
               for x in fft_complex]
    
    fft_xvals = [day / days[-1] for day in days]
    npts = len(fft_xvals) // 2 + 1
    fft_mag = fft_mag[:npts]
    fft_xvals = fft_xvals[:npts]
        
    plt.ylabel('FFT Magnitude')
    plt.xlabel(r"Frequency [days]$^{-1}$")
    plt.title('Fourier Transform')
    plt.plot(fft_xvals[1:],fft_mag[1:],label = col )
    # Draw lines at 1, 1/2, and 1/3 week periods
    plt.axvline(x=1./7,color='red',alpha=0.3)
    plt.axvline(x=2./7,color='red',alpha=0.3)
    plt.axvline(x=3./7,color='red',alpha=0.3)

    plt.show()
    
for col in df_daily.columns:
    plot_with_fft(df_daily, col)
In [ ]: